Anatomy of Facebook by Facebook Data Science Team
Seeking safety by The Economist Data Team
install.packages("ggplot2")
install.packages("lattice")faster (though only noticeable over many and large plots)
simpler (at first)
better at trellis graphs
able to do 3d graphs
generally more elegant
more syntactically logical (and therefore simpler, once you learn it)
better at grouping
able to interface with maps
well documented [http://docs.ggplot2.org/current/]
library(lattice)The general call for lattice graphics looks something like this:
graph_type(formula, data=, [options])The specifics of the formula differ for each graph type, but the general format is straightforward
y # Show the distribution of y
y~x # Show the relationship between x and y
y~x|A # Show the relationship between x and y conditional on the values of A
y~x|A*B # Show the relationship between x and y conditional on the combinations of A and B
z~y*x # Show the 3D relationship between x, y, and zlibrary(ggplot2)The general call for ggplot2 graphics looks something like this:
ggplot(data=, aes(x=,y=, [options])) + geom_xxxx() + ... + ... + ...Note that ggplot2 graphs in layers in a continuing call (hence the endless +…+…+…), which really makes the extra layer part of the call
...+geom_xxxx(data=, aes(x=,y=,[options]),[options])+...+...+...You can see the layering effect by comparing the same graph with different colors for each layer
ggplot(data=data, aes(x=year, y=realgdpgr)) + geom_point(color="black") + geom_point(aes(x=year, y=unemp), color="red")
ggplot(data=data, aes(x=year, y=realgdpgr)) + geom_point(color="red") + geom_point(aes(x=year, y=unemp), color="black")
ggplot(data=data, aes(x=year, y=outlays)) + geom_point() # ggplot2
xyplot(outlays~year, data=data) # lattice
ggplot(data=data[data$country=="USA",], aes(x=year, y=outlays)) + geom_line() # ggplot2
xyplot(outlays~year, data=data[data$country=="USA",], type="l") # lattice
ggplot(data=data, aes(x=country, y=outlays)) + geom_boxplot() # ggplot2
bwplot(outlays~country, data=data) # lattice
# Create data.frame of average growth rates by country over time
library(plyr)
growth <- ddply(.data=data, .variables=.(country), summarize, mean=mean(realgdpgr, na.rm=T))
ggplot(data=growth, aes(x=country, y=mean)) + geom_bar(stat="identity") # ggplot2
barchart(mean~country, data=growth) # lattice
ggplot(data=data, aes(x=vturn)) + geom_density() # ggplot2
densityplot(~vturn, data=data) # lattice
data(volcano) # Load volcano contour data
volcano[1:10, 1:10] # Examine volcano dataset (first 10 rows and columns)## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 100 100 101 101 101 101 101 100 100 100
## [2,] 101 101 102 102 102 102 102 101 101 101
## [3,] 102 102 103 103 103 103 103 102 102 102
## [4,] 103 103 104 104 104 104 104 103 103 103
## [5,] 104 104 105 105 105 105 105 104 104 103
## [6,] 105 105 105 106 106 106 106 105 105 104
## [7,] 105 106 106 107 107 107 107 106 106 105
## [8,] 106 107 107 108 108 108 108 107 107 106
## [9,] 107 108 108 109 109 109 109 108 108 107
## [10,] 108 109 109 110 110 110 110 109 109 108
volcano3d <- reshape2::melt(volcano) # Use reshape2 package to melt the data
head(volcano3d) # Examine volcano3d dataset (head)## Var1 Var2 value
## 1 1 1 100
## 2 2 1 101
## 3 3 1 102
## 4 4 1 103
## 5 5 1 104
## 6 6 1 105
names(volcano3d) <- c("xvar", "yvar", "zvar") # Rename volcano3d columns
ggplot(data=volcano3d, aes(x=xvar, y=yvar, z = zvar)) + geom_contour() # ggplot2
contourplot(zvar~xvar + yvar, data=volcano3d) # lattice
ggplot(data=volcano3d, aes(x=xvar, y=yvar, z = zvar)) + geom_tile(aes(fill=zvar)) # ggplot2
levelplot(zvar~xvar + yvar, data=volcano3d) # lattice
# Create a subset of the dataset containing only data for France
france.data <- data[data$country=="France",]
cloud(outlays~year*realgdpgr, data=france.data)
# Create a subset of the dataset containing only data for Greece, Portugal, Ireland, and Spain
pigs.data <- data[data$country %in% c("Greece", "Portugal", "Ireland", "Spain"),]
cloud(outlays~year*realgdpgr|country, data=pigs.data)
ggplot(data=pigs.data, aes(x=year, y=outlays)) + geom_point()
# Add linear model (lm) smoother
ggplot(data=pigs.data, aes(x=year, y=outlays)) + geom_point() +
geom_smooth(method="lm")
# Add local linear model (loess) smoother, span of 0.75
ggplot(data=pigs.data, aes(x=year, y=outlays)) + geom_point() +
geom_smooth(method="loess", span=.75)
# Add local linear model (loess) smoother, span of 0.25
ggplot(data=pigs.data, aes(x=year, y=outlays)) + geom_point() +
geom_smooth(method="loess", span=.25)
# Add linear model (lm) smoother, no standard error shading
ggplot(data=pigs.data, aes(x=year, y=outlays)) + geom_point() +
geom_smooth(method="lm", se=F)
# Add local linear model (loess) smoother, no standard error shading
ggplot(data=pigs.data, aes(x=year, y=outlays)) + geom_point() +
geom_smooth(method="loess", se=F)
# Add a local linear (loess) smoother for each country
ggplot(data=pigs.data, aes(x=year, y=outlays)) + geom_point(aes(color=country)) +
geom_smooth(aes(color=country))
# Add a local linear (loess) smoother for each country, no standard error shading
ggplot(data=pigs.data, aes(x=year, y=outlays)) +
geom_point(aes(color=country, size=realgdpgr)) +
geom_smooth(aes(color=country), se=F)
ggplot(data=data, aes(x=year, y=outlays)) + geom_point() +
xlab(label="Voter Turnout (%)") + ylab(label="Government Outlays") +
ggtitle(label="Cool Graph") # ggplot2
xyplot(outlays~year, data=data, xlab="Year", ylab="Government Outlays", main
="Cool Graph") # lattice
ggplot(data=data, aes(x=year, y=outlays)) + geom_point() + facet_wrap(~country) # ggplot2
xyplot(outlays~year|country, data=data) # lattice
# Initialize gridExtra library
library(gridExtra)
# Create 3 plots to combine in a table
plot1 <- ggplot(data=pigs.data, aes(x=year, y=outlays, color=)) +
geom_line(aes(color=country))
plot2 <- ggplot(data=pigs.data, aes(x=year, y=outlays, linetype=)) +
geom_line(aes(linetype=country))
plot3 <- ggplot(data=pigs.data, aes(x=year, y=outlays, shape=)) +
geom_point(aes(shape=country))
# Call grid.arrange
grid.arrange(plot1, plot2, plot3, nrow=3, ncol=1)
Two basic image types
Every pixel of a plot contains its own separate coding; not so great if you want to resize the image
jpeg(filename="example.png", width=, height=)
plot(x,y)
dev.off()Every element of a plot is encoded with a function that gives its coding conditional on several factors; great for resizing
pdf(filename="example.pdf", width=, height=)
plot(x,y)
dev.off()# Assume we saved our plot is an object called example.plot
# lattice
trellis.device(device="pdf", filename="example.pdf")
print(example.plot)
dev.off()
# ggplot2
ggsave(filename="example.pdf", plot=example.plot, scale=, width=, height=) # ggplot2By now, you might be noticing some trends in how these two packages approach graphics
lattice tends to focus on a particular type of graph and how to represent cross-sectional variation by splitting it up into smaller chunks
Becoming a proficient user of lattice requires learning a huge array of graph-specific formulas and options
ggplot2 tries to represent much more of the cross-sectional variation by making use of various “aesthetics”; general approach is based on The Grammar of Graphics
Basic idea is that the visualization of all data requires four items
One or more statistics conveying information about the data (identities, means, medians, etc.)
A coordinate system that differentiates between the intersections of statistics (at most two for ggplot, three for lattice)
Geometries that differentiate between off-coordinate variation in kind
Scales that differentiate between off-coordinate variation in degree
The National Hurricane Center (NHC) collects datasets with all storms around the globe.
For all storms, we have the location of the storm, every six hours (at midnight, six a.m., noon and six p.m.). Note that we have also the date, the maximal wind speed – on a 6 hour window – and the pressure in the eye of the storm. Based on the location of the storm, we also record the “Tropical basin”, a geological sub-group, for each storm.
For all major storms falling between 1989 to 2013, I add information on deaths and economic damages for major hurricanes over that period from Wikipedia. The information is stored in “AtlanticHurricanes.rda”.
You can download the data files at [https://www.ocf.berkeley.edu/~wcai/share/hurricanes/]
# Load packages
library(ggplot2)
# Load data without unziping
library(data.table)
filename <- "./hurricanes/hurricane.csv.bz2"
con <- bzfile(filename, open = 'r')
dt <- read.csv(con, na.strings = '', stringsAsFactors = FALSE)
# (optional) turn into data.table object
dt <- as.data.table(dt)
summary(dt)## V1 Season Num Basin
## Length:327253 Min. :1842 Min. : 1.00 Length:327253
## Class :character 1st Qu.:1938 1st Qu.: 5.00 Class :character
## Mode :character Median :1969 Median :10.00 Mode :character
## Mean :1961 Mean :11.89
## 3rd Qu.:1991 3rd Qu.:16.00
## Max. :2014 Max. :61.00
## Sub_basin Name ISO_time
## Length:327253 Length:327253 Length:327253
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## Nature Latitude Longitude Wind.kt
## Length:327253 Min. :-999.0 Min. :-999.0 Min. :-999.0
## Class :character 1st Qu.:-999.0 1st Qu.:-999.0 1st Qu.:-999.0
## Mode :character Median : -17.1 Median :-100.2 Median : 0.0
## Mean :-413.0 Mean :-398.7 Mean :-400.1
## 3rd Qu.: 18.2 3rd Qu.: 103.0 3rd Qu.: 35.0
## Max. : 72.5 Max. : 180.0 Max. : 165.0
## Pressure.mb V13 V14 V15
## Min. :-999.00 Length:327253 Min. :-999.00 Min. :-999.00
## 1st Qu.:-999.00 Class :character 1st Qu.:-999.00 1st Qu.:-999.00
## Median : 0.00 Mode :character Median :-100.00 Median :-100.00
## Mean : -21.23 Mean :-421.67 Mean :-418.37
## 3rd Qu.: 990.00 3rd Qu.: 32.42 3rd Qu.: 33.27
## Max. :9999.00 Max. : 100.00 Max. : 100.00
## V16 Degrees_North Deegrees_East
## Length:327253 Min. :-68.500 Min. :-180.00
## Class :character 1st Qu.:-14.100 1st Qu.: 39.63
## Mode :character Median : 14.200 Median : 92.40
## Mean : 7.452 Mean : 61.81
## 3rd Qu.: 22.000 3rd Qu.: 132.11
## Max. : 72.500 Max. : 180.00
Filter out missing data and unnamed hurricanes
# Data cleaning
myNames <- c("Season",
"Num", "Basin", "Sub_basin", "Name", "ISO_time", "Nature",
"Latitude", "Longitude", "Wind.kt", "Pressure.mb",
"Degrees_North", "Deegrees_East")
dt <- dt[Latitude > -999 & Longitude > -999 & Wind.kt>0 &
!Name %in% c("UNNAMED", "NONAME:UNNAMED"),
myNames, with=FALSE]Remove white space, add storm ID and add months
myMonth <- function(x){
time.date = strsplit(x, " ")
iso.date = unlist(lapply(time.date, function(x) x[1]))
iso.month = substr(iso.date, 6, 7)
factor(iso.month, labels = c(month.name))
}
dt[, `:=`(Basin = gsub(" ", "", Basin),
ID = paste(Name, Season, sep = "."),
Month = myMonth(ISO_time))]Explore the distribution of the data
plot(table(dt$Season))table(dt$Basin)##
## EP NA NI SA SI SP WP
## 22136 22054 3466 61 17597 9019 22039
PLUS: 10 tips for making your R graphics look their best (by Revolution R)
http://blog.revolutionanalytics.com/2009/01/10-tips-for-making-your-r-graphics-look-their-best.html
Data: [https://www.ocf.berkeley.edu/~wcai/share/airlines/AirlineDataAll.csv]
You’re welcome to try out either lattice or ggplot for these questions, but in the solutions we’ll focus on the ggplot approach.
For some of these you may want to use a smaller version of the dataset, such as a random subset, subset <- air[sample(1:nrow(air), 10000, replace = FALSE), ].
Plot a histogram of the flight delays with negative delays set to zero, censoring delay times at a maximum of 60 minutes.
Plot the arrival delay against the departure delay as a scatterplot.
Clean up your scatterplot with a title and axis labels. Output it as a PDF and see if you’d be comfortable with including it in a report/paper.
Make a boxplot of the departure delay as a function of the day of week.
Create a trellis plot of departure delay boxplots, one per destination for this subset of destinations, DestSubset <- c('LAX','SEA','PHX','DEN','MSP','JFK','ATL','DFW','IAH', 'ORD'). Use a 2x5 layout of panels in the plot.
Subset the data to flights going to Chicago (ORD) and Houston (IAH). Plot arrival delay against scheduled departure time (CRSDepTime). Now plot so that flights to Chicago are in one color and those to Houston in another. Use scale_x_continuous() and scale_y_continuous() to set the x-axis limits to be in the range from 6 am to midnight and the y-axis limits to be in the range (-10, 120).
ggplot(data=data, aes(x=year, y=outlays)) + geom_point() # ggplot2
ggplot(data=data, aes(x=year, y=outlays)) + geom_point(size=3) # ggplot2
ggplot(data=data, aes(x=year, y=outlays)) + geom_point(size=1) # ggplot2
xyplot(outlays~year, data=data) # lattice
xyplot(outlays~year, data=data, cex=2) # lattice
xyplot(outlays~year, data=data, cex=.5) # lattice
ggplot(data=data, aes(x=year, y=outlays)) + geom_point(color=colors()[145]) # ggplot2
ggplot(data=data, aes(x=year, y=outlays)) + geom_point(color="red") # ggplot2
xyplot(outlays~year, data=data, col=colors()[145]) #lattice
xyplot(outlays~year, data=data, col="red") #lattice
ggplot(data=data, aes(x=year, y=outlays)) + geom_point(shape=3) # ggplot2
ggplot(data=data, aes(x=year, y=outlays)) + geom_point(shape=15) # ggplot2
xyplot(outlays~year, data=data, pch=3) # lattice
xyplot(outlays~year, data=data, pch=15) # lattice
ggplot(data=data, aes(x=year, y=outlays)) + geom_point(shape=3) # ggplot2
ggplot(data=data, aes(x=year, y=outlays)) + geom_point(shape=15) # ggplot2
ggplot(data=data, aes(x=year, y=outlays)) + geom_point(shape="w") # ggplot2
ggplot(data=data, aes(x=year, y=outlays)) + geom_point(shape="$", size=5) # ggplot2
xyplot(outlays~year, data=data, pch=3) # lattice
xyplot(outlays~year, data=data, pch=15) # lattice
xyplot(outlays~year, data=data, pch="w") # lattice
xyplot(outlays~year, data=data, pch="$", cex=2) # lattice
ggplot(data=data[data$country=="USA",], aes(x=year, y=outlays)) +
geom_line(linetype=1) # ggplot2
ggplot(data=data[data$country=="USA",], aes(x=year, y=outlays)) +
geom_line(linetype=2) # ggplot2
ggplot(data=data[data$country=="USA",], aes(x=year, y=outlays)) +
geom_line(linetype=3) # ggplot2
ggplot(data=data[data$country=="USA",], aes(x=year, y=outlays)) +
geom_line(linetype=3, size=1) # ggplot2
ggplot(data=data[data$country=="USA",], aes(x=year, y=outlays)) +
geom_line(linetype=3, size=1.5) # ggplot2
ggplot(data=data[data$country=="USA",], aes(x=year, y=outlays)) +
geom_line(linetype=3, size=2) # ggplot2
xyplot(outlays~year, data=data[data$country=="USA",], type="l", lty=1) # lattice
xyplot(outlays~year, data=data[data$country=="USA",], type="l", lty=2) # lattice
xyplot(outlays~year, data=data[data$country=="USA",], type="l", lty=3) # lattice
xyplot(outlays~year, data=data[data$country=="USA",], type="l", lty=3, lwd=2) # lattice
xyplot(outlays~year, data=data[data$country=="USA",], type="l", lty=3, lwd=3) # lattice
xyplot(outlays~year, data=data[data$country=="USA",], type="l", lty=3, lwd=4) # lattice
ggplot(data=, aes(x=, y=, color=, linetype=, shape=, size=))ggplot2 is optimized for showing variation on all four aesthetic types
# Differences in kind using color
ggplot(data=pigs.data, aes(x=year, y=outlays)) + geom_line(aes(color=country))
Note what happens when we specify the color parameter outside of the aesthetic operator. ggplot2 views these specifications as invalid graphical parameters.
# ggplot(data=pigs.data, aes(x=year, y=outlays)) + geom_line(color=country)ggplot(data=pigs.data, aes(x=year, y=outlays)) + geom_line(color="country")
## Error in grDevices::col2rgb(colour, TRUE) : invalid color name 'country'ggplot(data=pigs.data, aes(x=year, y=outlays)) + geom_line(color="red")
# Differences in kind using line types
ggplot(data=pigs.data, aes(x=year, y=outlays)) + geom_line(aes(linetype=country))
# Differences in kind using point shapes
ggplot(data=pigs.data, aes(x=year, y=outlays)) + geom_point(aes(shape=country))
# Differences in degree using color
ggplot(data=pigs.data, aes(x=year, y=outlays)) + geom_point(aes(color=realgdpgr))
# Differences in degree using point size
ggplot(data=pigs.data, aes(x=year, y=outlays)) + geom_point(aes(size=realgdpgr))
# Multiple non-cartesian aesthetics (differences in kind using color, degree using point size)
ggplot(data=pigs.data, aes(x=year, y=outlays)) +
geom_point(aes(color=country,size=realgdpgr))